In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the house-sales table from a CSV expected in the working directory.
df = pd.read_csv("house_data.csv")
# Print column names, non-null counts and dtypes as a quick schema check.
df.info() 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21613 non-null  int64  
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21613 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long           21613 non-null  float64
 19  sqft_living15  21613 non-null  int64  
 20  sqft_lot15     21613 non-null  int64  
dtypes: float64(5), int64(15), object(1)
memory usage: 3.5+ MB

Split the dataset into two halves: the first half becomes `df1`, the second half `df2`.

In [2]:
# Positional midpoint; with an odd row count the second half gets the extra row.
splitting_int = len(df) // 2
# .copy() gives each half its own data. Later cells add and overwrite columns
# on df1/df2; without the copy those writes target slices of `df`, which
# pandas reports as SettingWithCopyWarning and may not apply as expected.
df1 = df.iloc[:splitting_int, :].copy()
df2 = df.iloc[splitting_int:, :].copy()
print(df1.shape)
print(df2.shape)
(10806, 21)
(10807, 21)

Elbow Plot To Determine Optimal Number of Clusters

In [3]:
from sklearn.cluster import KMeans 
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

def elbow_plot(data=None, max_clusters=9):
    """Plot KMeans inertia (WCSS) for 1..max_clusters clusters as a bar chart.

    Parameters
    ----------
    data : DataFrame or array-like, optional
        Feature matrix to cluster. When omitted, the ``price`` and
        ``sqft_living`` columns of the notebook-level ``df1`` are used,
        looked up at call time.
    max_clusters : int, default 9
        Largest number of clusters to try (inclusive).

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Resolve the default here rather than in the signature: a default
    # expression is evaluated once, when `def` runs, so it would keep pointing
    # at whatever df1 was at that moment even after df1 is rebuilt.
    if data is None:
        data = df1[['price', 'sqft_living']]

    # NOTE(review): the features are clustered as given (unscaled), while the
    # clustering cell fits on MinMax-scaled values -- consider passing scaled
    # data so the elbow reflects the same distance metric.
    cluster_counts = range(1, max_clusters + 1)
    score = []
    for cluster in cluster_counts:
        kmeans = KMeans(n_clusters=cluster, init="k-means++", random_state=10)
        kmeans.fit(data)
        # inertia_ is the within-cluster sum of squares for this fit.
        score.append(kmeans.inertia_)

    plt.bar(cluster_counts, score)
    plt.title('The Elbow Method')
    plt.xlabel('no of clusters')
    plt.ylabel('wcss')
    plt.grid()
    plt.show()

# Draw the elbow chart with the defaults: df1's price/sqft_living, k = 1..9.
elbow_plot()
In [4]:
# Features used for clustering. MinMax scaling puts both on a common range so
# that price (hundreds of thousands) does not dominate sqft_living (thousands).
cluster_features = ['price', 'sqft_living']
scaler = MinMaxScaler()
# Scale into a separate frame instead of overwriting df1 and undoing it with
# inverse_transform afterwards: the round trip can leave floating-point noise
# in the original values and leaves df1 scaled if the cell stops half-way.
df1_scaled = pd.DataFrame(
    scaler.fit_transform(df1[cluster_features]),
    columns=cluster_features,
    index=df1.index,
)
# Apply k-means clustering to the first half of the dataset (df1).
kmeans = KMeans(n_clusters = 5, random_state = 1000).fit(df1_scaled)
# assign() builds a new frame, so re-running the cell is safe and no
# chained-assignment warning is triggered.
df1 = df1.assign(cluster=kmeans.labels_)
In [5]:
import pickle

# Persist the fitted model, then read it back. The context managers close the
# file handles deterministically; bare open(...) calls leave that to the
# garbage collector, so the dump is not guaranteed to be flushed before the load.
with open("save.pkl", "wb") as model_file:
    pickle.dump(kmeans, model_file)
# NOTE: pickle.load can execute arbitrary code -- only load files this notebook wrote.
with open("save.pkl", "rb") as model_file:
    km = pickle.load(model_file)
In [6]:
cluster_features = ['price', 'sqft_living']
# Use the dedicated scaler for the second half. Fitting `scaler` again here
# would overwrite the min/max it learned from df1.
scaler2 = MinMaxScaler()
# Scale into a separate frame so df2 keeps its original price/sqft_living
# values and no inverse_transform round trip is needed.
df2_scaled = pd.DataFrame(
    scaler2.fit_transform(df2[cluster_features]),
    columns=cluster_features,
    index=df2.index,
)
# NOTE(review): fit() discards the centroids restored from save.pkl and
# re-learns them on df2, so cluster ids are not comparable between df1 and
# df2. If the goal is to apply the df1 model to new data, use
# km.predict(scaler.transform(df2[cluster_features])) instead -- confirm intent.
km.fit(df2_scaled)
# assign() returns a new frame, avoiding writes into a slice of `df`.
df2 = df2.assign(cluster=km.labels_)
In [7]:
def cluster_summary(data):
    """Return per-cluster means and member counts, one column per cluster.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``cluster`` column plus the columns ``id``, ``lat``,
        ``long``, ``yr_built``, ``yr_renovated`` and ``zipcode``, which are
        dropped before averaging. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Rows are the remaining numeric columns (means rounded to 3 decimals)
        followed by a ``Count`` row; columns are the cluster labels.

    Raises
    ------
    KeyError
        If ``cluster`` or any of the dropped columns is missing.
    """
    # These columns are excluded from the per-cluster averages.
    features = data.drop(
        columns=['id', 'lat', 'long', 'yr_built', 'yr_renovated', 'zipcode']
    )
    grouped = features.groupby('cluster')
    # numeric_only=True skips text columns such as `date`; without it,
    # pandas >= 2.0 raises TypeError when asked to average an object column.
    cluster_means = grouped.mean(numeric_only=True).round(3)
    # Number of rows assigned to each cluster.
    cluster_means['Count'] = grouped.size()
    # Transpose so each cluster is a column and each feature a row.
    return cluster_means.T
In [8]:
# Per-cluster feature means and member counts for the first half (df1).
cluster_summary(df1)
Out[8]:
cluster 0 1 2 3 4
price 716065.599 345829.823 1293745.201 473567.978 3483673.469
bedrooms 3.973 2.766 4.206 3.499 4.490
bathrooms 2.610 1.475 3.284 2.148 4.449
sqft_living 2918.798 1253.197 4152.578 2029.206 6343.061
sqft_lot 23137.353 10665.438 40427.673 13586.614 26605.327
floors 1.689 1.224 1.833 1.442 1.878
waterfront 0.011 0.002 0.038 0.003 0.306
view 0.421 0.072 1.012 0.170 2.184
condition 3.407 3.455 3.372 3.471 3.429
grade 8.514 6.802 9.869 7.523 11.347
sqft_above 2463.785 1162.724 3416.388 1675.860 4821.224
sqft_basement 455.013 90.474 736.190 353.347 1521.837
sqft_living15 2533.931 1499.853 3245.339 1948.689 3627.347
sqft_lot15 18666.349 9412.748 27713.528 11931.144 17174.000
Count 2157.000 4030.000 578.000 3992.000 49.000
In [9]:
# Per-cluster feature means and member counts for the second half (df2).
cluster_summary(df2)
Out[9]:
cluster 0 1 2 3 4
price 799701.248 330271.843 2619619.048 1441251.938 508907.394
bedrooms 3.972 2.859 4.410 4.167 3.570
bathrooms 2.756 1.668 3.905 3.253 2.282
sqft_living 3114.635 1351.959 5057.571 3914.517 2194.658
sqft_lot 21757.013 9954.289 24985.810 24919.535 14187.690
floors 1.789 1.391 1.905 1.856 1.588
waterfront 0.008 0.002 0.200 0.045 0.004
view 0.429 0.059 1.943 1.097 0.175
condition 3.336 3.375 3.438 3.386 3.381
grade 8.837 6.937 10.819 9.870 7.730
sqft_above 2677.737 1237.687 4076.467 3246.329 1862.945
sqft_basement 436.898 114.271 981.105 668.188 331.714
sqft_living15 2660.778 1526.013 3506.600 3090.355 2047.697
sqft_lot15 16942.055 9157.346 19869.705 20164.754 12603.634
Count 1803.000 4473.000 105.000 516.000 3910.000

Scatterplot Showing Segmentation of Price against Sqft_Living

In [10]:
import plotly.express as px

# Extra columns shown in the hover tooltip; reused by the later plotting cells.
hoverdata_list=['floors','grade','view','waterfront','lat','long','yr_built']
# Plot a plotly interactive scatter plot for df1.
# Cluster ids are labels, not quantities: casting them to str makes plotly
# draw one discrete colour and legend entry per cluster instead of a
# continuous colour bar that implies an ordering.
df1_plot = df1.assign(cluster=df1['cluster'].astype(str))
fig = px.scatter(df1_plot, y='price', x='sqft_living',
                color='cluster', render_mode='svg', template='plotly',
                category_orders={'cluster': sorted(df1_plot['cluster'].unique())},
                hover_data=hoverdata_list,
                hover_name='id')
fig.show()
In [11]:
# Plot a plotly interactive scatter plot for df2.
# Cluster ids are labels, not quantities: casting them to str makes plotly
# draw one discrete colour and legend entry per cluster instead of a
# continuous colour bar that implies an ordering.
df2_plot = df2.assign(cluster=df2['cluster'].astype(str))
fig = px.scatter(df2_plot, y='price', x='sqft_living',
                color='cluster', render_mode='svg', template='plotly',
                category_orders={'cluster': sorted(df2_plot['cluster'].unique())},
                hover_data=hoverdata_list,
                hover_name='id')
fig.show()

Geographic Dispersion of Clusters

In [12]:
# Map of df1 coloured by cluster.
# The ids are cast to str so plotly uses discrete colours. On the cyclical
# IceFire scale the lowest and highest cluster id would get the same colour,
# because a cyclical scale starts and ends on the same shade.
df1_map = df1.assign(cluster=df1['cluster'].astype(str))
fig = px.scatter_mapbox(df1_map, lat="lat", lon="long", color="cluster",hover_name='id',hover_data=hoverdata_list,
                  category_orders={'cluster': sorted(df1_map['cluster'].unique())},
                  size_max=15, zoom=10,
                  mapbox_style="carto-positron")
fig.show()
In [13]:
# Map of df2 coloured by construction year.
# A sequential scale is used because yr_built is not periodic: on a cyclical
# scale such as IceFire both ends share one colour, so the oldest and newest
# houses would be indistinguishable.
fig = px.scatter_mapbox(df2, lat="lat", lon="long", color="yr_built", hover_name='id',hover_data=hoverdata_list,
                  color_continuous_scale=px.colors.sequential.Viridis, size_max=15, zoom=10,
                  mapbox_style="carto-positron")
fig.show()
In [14]:
# Map of df2 coloured by grade.
# A sequential scale is used because grade is not periodic: on a cyclical
# scale such as IceFire both ends share one colour, so the lowest and highest
# grades would be indistinguishable.
fig = px.scatter_mapbox(df2, lat="lat", lon="long", color="grade", hover_name='id',hover_data=hoverdata_list,
                  color_continuous_scale=px.colors.sequential.Viridis, size_max=15, zoom=10,
                  mapbox_style="carto-positron")
fig.show()

How Price Is Affected By Amenities & Can Be Used For Clustering

In [15]:
# Ternary plot of bedrooms / bathrooms / floors for df1, marker size by price.
# Cluster ids are cast to str so each cluster gets its own discrete colour and
# legend entry rather than a position on a continuous colour bar.
df1_ternary = df1.assign(cluster=df1['cluster'].astype(str))
fig = px.scatter_ternary(df1_ternary, a="bedrooms", b="bathrooms", c="floors",color='cluster',hover_name='id',size='price',
                         category_orders={'cluster': sorted(df1_ternary['cluster'].unique())})
fig.show()
In [16]:
# Ternary plot of bedrooms / bathrooms / floors for df2, marker size by price.
# Cluster ids are cast to str so each cluster gets its own discrete colour and
# legend entry rather than a position on a continuous colour bar.
df2_ternary = df2.assign(cluster=df2['cluster'].astype(str))
fig = px.scatter_ternary(df2_ternary, a="bedrooms", b="bathrooms", c="floors",color='cluster',hover_name='id',size='price',
                         category_orders={'cluster': sorted(df2_ternary['cluster'].unique())})
fig.show()